# Load all participant raw annotation data and keep only the variables of
# interest for the Jaccard analysis.
jaccard_df <- select(
  read.csv("./data/processed/jaccard_data.csv"),
  PID, transcript_id, new_topic, scaled_turn_id
)
# Reshape: one row per (PID, transcript) with the list of scaled turn IDs at
# which that participant marked a topic switch.
# NOTE(review): result should have 1505 rows, i.e. the number of participants.
jaccard_df <- jaccard_df |>
  filter(!is.na(new_topic)) |>
  select(PID, transcript_id, scaled_turn_id) |>
  group_by(PID, transcript_id) |>
  # Collapse each participant's switch turns into a list-column.
  summarize(topic_switch_turns = list(scaled_turn_id), .groups = "drop")
# Jaccard similarity between two participants' topic-switch turn sets, where
# each marked turn is expanded to a window of +/- `window` surrounding turns.
#
# Args:
#   turns_A, turns_B: numeric vectors of (scaled) turn IDs at which each
#                     participant marked a topic switch.
#   window:           half-width of the window around each turn (default 3).
#
# Returns: |A intersect B| / |A union B| over the expanded turn sets.
#   Returns 0 when exactly one input is empty, NaN when both are empty
#   (same as the original implementations).
jaccard_similarity_window <- function(turns_A, turns_B, window = 3) {
  # Expand every marked turn into its +/- window of turns; unique() because
  # nearby switches produce overlapping windows.
  set_A <- unique(unlist(lapply(turns_A, function(x) seq(x - window, x + window, by = 1))))
  set_B <- unique(unlist(lapply(turns_B, function(x) seq(x - window, x + window, by = 1))))
  length(intersect(set_A, set_B)) / length(union(set_A, set_B))
}

# Backward-compatible wrapper: window of 3 turns.
jaccard_similarity_3 <- function(turns_A, turns_B) {
  jaccard_similarity_window(turns_A, turns_B, window = 3)
}

# Backward-compatible wrapper: window of 5 turns.
# (The original copy's comment incorrectly said "3-turn window".)
jaccard_similarity_5 <- function(turns_A, turns_B) {
  jaccard_similarity_window(turns_A, turns_B, window = 5)
}
Jaccard similarity with a window of 3 turns
# Jaccard similarity for every pair of annotators within each transcript,
# then the grand mean and SD of the per-transcript averages.
jaccard_df |>
  group_by(transcript_id) |>
  # All ordered annotator pairs within a transcript.
  # NOTE(review): each unordered pair appears twice (A,B and B,A); Jaccard is
  # symmetric, so the averages are unaffected — only the row count doubles.
  expand(PID1 = PID, PID2 = PID) |>
  filter(PID1 != PID2) |>
  # Attach each member's topic-switch turn list (suffixes .x and .y).
  left_join(jaccard_df, by = c("transcript_id", "PID1" = "PID")) |>
  left_join(jaccard_df, by = c("transcript_id", "PID2" = "PID")) |>
  # rowwise(): the turn lists are list-columns, scored one pair at a time.
  rowwise() |>
  mutate(jaccard_score = jaccard_similarity_3(topic_switch_turns.x, topic_switch_turns.y)) |>
  select(transcript_id, jaccard_score, PID1, PID2) |>
  ungroup() |>
  group_by(transcript_id) |>
  # Per-transcript average, repeated on every pair row of that transcript.
  mutate(transcript_average_jaccard = mean(jaccard_score)) |>
  ungroup() |>
  # NOTE(review): averaging the repeated per-transcript value over all pair
  # rows weights transcripts by their number of pairs — confirm every
  # transcript has the same number of annotators.
  summarize(mean_jaccard_score = mean(transcript_average_jaccard),
            sd_jaccard_score = sd(transcript_average_jaccard))
# Build an edge list (from, to, probability) from the topic transition matrix.
edges <- transition_matrix |>
  select(from = prior_topic, to = current_topic, probability) |>
  distinct()
# Make a directed graph object from the edge list.
graph <- as_tbl_graph(edges, directed = TRUE)
# Add topic base rate (prior_sum) as a node attribute.
# NOTE(review): assigns by position — assumes the distinct prior_topic rows
# come out in the same order as the graph's vertices; confirm.
V(graph)$prior_sum <- transition_matrix |>
  select(prior_topic, prior_sum) |>
  distinct() |>
  pull(prior_sum)
# Edge weights (transition probabilities) for visualization.
# NOTE(review): also positional; if distinct() above dropped any duplicate
# edges, transition_matrix$probability would misalign with E(graph) — verify
# the two have equal length.
E(graph)$weight <- transition_matrix$probability
# Pull the edges back out as a tibble to derive per-edge attributes.
edges <- as_tibble(igraph::as_data_frame(graph, what = "edges"))
# Color each edge by its source node.
edges <- edges |>
  mutate(edge_color = as.factor(from))
# Attach the colors, then keep only edges with probability >= 0.05.
graph <- graph |>
  activate(edges) |>
  mutate(edge_color = edges$edge_color) |>
  filter(weight >= 0.05)
# Unique group id per remaining edge (used downstream for plotting).
graph <- graph %>%
  mutate(edge_group = as.factor(seq_along(E(graph))))
# Same graph construction as above, but for themes instead of topics.
# Build an edge list (from, to, probability) from the theme transition matrix.
edges <- theme_matrix |>
  select(from = prior_topic, to = current_topic, probability) |>
  distinct()
# Make a directed graph object from the edge list.
graph <- as_tbl_graph(edges, directed = TRUE)
# Add theme base rate (prior_sum) as a node attribute.
# NOTE(review): positional assignment — assumes distinct prior_topic rows are
# in the same order as the graph's vertices; confirm.
V(graph)$prior_sum <- theme_matrix |>
  select(prior_topic, prior_sum) |>
  distinct() |>
  pull(prior_sum)
# Edge weights (transition probabilities) for visualization.
# NOTE(review): positional; verify length(E(graph)) == nrow(theme_matrix)
# after the distinct() above.
E(graph)$weight <- theme_matrix$probability
# Pull the edges back out as a tibble to derive per-edge attributes.
edges <- as_tibble(igraph::as_data_frame(graph, what = "edges"))
# Color each edge by its source node.
edges <- edges |>
  mutate(edge_color = as.factor(from))
# Attach the colors, then keep only edges with probability >= 0.125.
graph <- graph |>
  activate(edges) |>
  mutate(edge_color = edges$edge_color) |>
  filter(weight >= 0.125)
# Unique group id per remaining edge (used downstream for plotting).
graph <- graph %>%
  mutate(edge_group = as.factor(seq_along(E(graph))))
What proportion of transcripts are in a given cluster topic at a given scaled time point?
# Occurrence of each cluster label at each scaled turn across the 200
# conversations: prop = (transcripts containing the label at that turn) / 200.
topic_occ <- transcripts_clusters |>
  select(PID, transcript_id, cluster_label, scaled_turn_id, group_theme) |>
  distinct() |>
  group_by(transcript_id, scaled_turn_id, cluster_label, group_theme) |>
  # Presence indicator per (transcript, turn, label, theme). The original
  # `as.integer(any(PID == PID))` was a roundabout constant — every group has
  # at least one row — so it is written as an explicit 1L here.
  # NOTE(review): the original would yield NA if a group's PIDs were all NA;
  # assumes PID is never NA at this point — confirm upstream filtering.
  summarize(label_present = 1L, .groups = "drop") |>
  group_by(scaled_turn_id, cluster_label, group_theme) |>
  # How many distinct transcripts show each label at each turn.
  summarize(transcript_count = sum(label_present), .groups = "drop") |>
  # Proportion out of 200 transcripts; rescale turn id to percent completion.
  mutate(prop = transcript_count / 200,
         scaled_turn_id = scaled_turn_id * 100)
# Majority cluster label per (transcript, scaled time bin), taken across the
# participants who annotated that conversation; ties are broken at random.
# Results are collected in a preallocated-style list and bound once at the
# end, instead of growing a data frame with rbind() inside the loop.
majority_rows <- list()

for (i in unique(transcripts_clusters$transcript_id)) {
  # This loop's transcript.
  this_transcript <- transcripts_clusters |> filter(transcript_id == i)

  for (j in unique(this_transcript$scaled_turn_id)) {
    # This loop's scaled time bin.
    this_bin <- this_transcript |> filter(scaled_turn_id == j)

    # One cluster label per participant in this bin, then vote counts per
    # label, highest first.
    participant_count <- this_bin |>
      group_by(PID) |>
      summarize(participant_cluster_label = unique(cluster_label),
                .groups = "drop") |>
      count(participant_cluster_label, name = "cluster_count") |>
      arrange(desc(cluster_count))

    # All labels tied for the top vote count (one row when there is a clear
    # winner, several rows on a tie).
    majority_cluster <- participant_count |>
      slice_max(order_by = cluster_count, n = 1)

    # Pick among tied labels by index. The original `sample(x, 1)` is a
    # classic R footgun: when x is a single numeric value n it samples from
    # 1:n instead of returning n, so index explicitly.
    top_labels <- majority_cluster$participant_cluster_label
    this_cluster_label <- top_labels[sample.int(length(top_labels), 1)]

    # Save bin, transcript ID, and winning label for the entropy analysis.
    majority_rows[[length(majority_rows) + 1]] <- data.frame(
      bin = j,
      transcript_id = i,
      majority_cluster_label = this_cluster_label
    )
  }
}

transcript_majority_clusters <- do.call(rbind, majority_rows)
# Preserve the original's empty-data-frame result when there is no input.
if (is.null(transcript_majority_clusters)) transcript_majority_clusters <- data.frame()
rm(majority_rows)
# Percentage appearance of each majority cluster topic across transcripts per
# time bin. count() replaces group_by() + summarize(length(...)) — it is
# equivalent and does not emit the "`summarise()` has grouped output" message
# that the original printed (visible in the rendered document).
topic_prop_majority <- transcript_majority_clusters |>
  count(bin, majority_cluster_label, name = "cluster_count") |>
  group_by(bin) |>
  # Total labels per bin, then each label's share of that total.
  mutate(bin_count = sum(cluster_count),
         prop = cluster_count / bin_count) |>
  ungroup()
`summarise()` has grouped output by 'bin'. You can override using the `.groups`
argument.
# Load the participant annotation data, dropping rows with no participant ID.
df_ann <- filter(
  read.csv("./data/processed/dense_subset_processed.csv"),
  !is.na(PID)
)
# Classify participants as coarse / middle / granular annotators based on the
# lower, middle, and upper thirds of the number of annotations they provided.

# Number of annotations per participant (should be 1505 rows = participants).
# n() is the idiomatic row count (identical to length(turn_id) here).
annotation_numbers <- df_ann |>
  dplyr::group_by(PID) |>
  summarize(total_PID_annotations = n())

# Carry the same per-participant count as a column on df_ann itself.
df_ann <- df_ann |>
  dplyr::group_by(PID) |>
  mutate(total_PID_annotations = n()) |>
  ungroup()

# Tertile breakpoints of the per-participant annotation counts.
# NOTE(review): cut() errors if two quantiles coincide (many tied counts) —
# confirm the count distribution has distinct tertile boundaries.
quantiles <- quantile(annotation_numbers$total_PID_annotations,
                      probs = c(0, 1/3, 2/3, 1))

# Bin each participant's count into the three behavior groups.
df_ann$annotation_behavior <- cut(df_ann$total_PID_annotations,
                                  breaks = quantiles,
                                  include.lowest = TRUE,
                                  labels = c("coarse", "middle", "granular"))

# Drop intermediates no longer needed.
rm(annotation_numbers, quantiles)
Create an annotation matching function. Examine annotation points for all participants and determine if a given tiled window contains the utterance they marked as signifying a topic shift in conversation. If the utterance they selected is within the window, mark it with “yes” and if not, mark it with a “no”. Do this for all participants separately.
# Check whether each participant's annotated topic-shift turns fall inside the
# tiled windows of the transcript they annotated.
#
# Args:
#   tiling_df:     tiled windows; must contain `transcript_id` and
#                  `A_end_turn` (the gap turn ending window A).
#   annotation_df: participant annotations; must contain `PID`,
#                  `transcript_id`, `turn_id`, `new_topic`, and
#                  `annotation_behavior`.
#
# Returns: the annotated transcript's tiling rows, one copy per participant,
#   with columns appended in this order: `annotated_turn` ("yes"/"no" — did
#   the participant mark this window's gap turn?), `annotated_label` (the
#   participant's topic label, NA where not annotated), `PID`, and
#   `annotation_behavior`.
detect_window_annotations <- function(tiling_df, annotation_df) {
  pids <- unique(annotation_df$PID)
  # Preallocate instead of growing the output with rbind() inside the loop.
  results <- vector("list", length(pids))

  for (k in seq_along(pids)) {
    this_PID <- pids[k]
    this_annotation <- annotation_df[annotation_df$PID == this_PID, , drop = FALSE]

    # A participant normally annotates one transcript, but %in% (rather than
    # ==) avoids silent vector recycling if they annotated more than one.
    this_transcript_ids <- unique(this_annotation$transcript_id)
    annotations_result <-
      tiling_df[tiling_df$transcript_id %in% this_transcript_ids, , drop = FALSE]

    # Columns appended in the same order as the original implementation.
    annotations_result$annotated_turn <- NA
    annotations_result$annotated_label <- NA
    annotations_result$PID <- this_PID
    # NOTE(review): assumes one behavior level per participant — a multi-level
    # participant would error here (as in the original); confirm upstream.
    annotations_result$annotation_behavior <- unique(this_annotation$annotation_behavior)

    PID_labels <- this_annotation$new_topic
    PID_turns <- this_annotation$turn_id

    # 1a) Is the gap turn (A_end_turn) one of the turns this participant marked?
    annotations_result$annotated_turn <-
      ifelse(annotations_result$A_end_turn %in% PID_turns, "yes", "no")

    # 1b) Attach the participant-provided label at each marked turn.
    # seq_along() (not 1:length()) is safe when a participant has zero labels.
    for (c in seq_along(PID_labels)) {
      annotations_result$annotated_label[
        annotations_result$A_end_turn == PID_turns[c]] <- PID_labels[c]
    }

    results[[k]] <- annotations_result
  }

  out <- do.call(rbind, results)
  # Preserve the original's empty-data-frame return for empty input.
  if (is.null(out)) data.frame() else out
}
# Coarse annotator example: cluster-label ribbon (tiles) with the three
# similarity trajectories overlaid as solid / dashed / dotdash lines.
# Shared data and x aesthetic are set once on ggplot(); layers inherit them.
ggplot(example_coarse, aes(x = scaled_turn_id * 100)) +
  geom_tile(aes(y = 0.5, fill = cluster_label)) +
  geom_line(aes(y = avg_topic_utterance_similarity), linetype = "solid") +
  geom_line(aes(y = avg_topic_cluster_similarity), linetype = "dashed") +
  geom_line(aes(y = avg_cluster_utterance_similarity),
            linetype = "dotdash", color = "black") +
  # geom_vline(data = example_coarse[!is.na(example_coarse$switch_cluster), ],
  #            aes(xintercept = (scaled_turn_id * 100) - 0.5, color = cluster_label),
  #            color = "white", linetype = "dotted") +
  theme_cowplot() +
  scale_fill_manual(values = met.brewer("Hiroshige", 7)) +
  labs(x = "Conversation Completion (%)", y = NULL,
       title = "Coarse Annotator") +
  scale_x_continuous(expand = c(0, 0)) +
  scale_y_continuous(expand = c(0, 0)) +
  theme(legend.position = "bottom",
        plot.title = element_text(hjust = 0.5))
# Granular annotator example: same layout as the coarse-annotator plot, with
# a 15-color palette. Shared data and x aesthetic are set once on ggplot().
ggplot(example_granular, aes(x = scaled_turn_id * 100)) +
  geom_tile(aes(y = 0.5, fill = cluster_label)) +
  geom_line(aes(y = avg_topic_utterance_similarity), linetype = "solid") +
  geom_line(aes(y = avg_topic_cluster_similarity), linetype = "dashed") +
  geom_line(aes(y = avg_cluster_utterance_similarity),
            linetype = "dotdash", color = "black") +
  theme_cowplot() +
  scale_fill_manual(values = met.brewer("Hiroshige", 15)) +
  labs(x = "Conversation Completion (%)", y = NULL,
       title = "Granular Annotator") +
  scale_x_continuous(expand = c(0, 0)) +
  # NOTE(review): a negative expand multiplier is unusual — confirm the
  # intended y-axis padding; kept exactly as in the original.
  scale_y_continuous(expand = c(-0.1, 0), limits = c(-0.1, 1.2)) +
  theme(legend.position = "bottom",
        plot.title = element_text(hjust = 0.5))
# Tiled semantic similarity for the example transcript only.
# NOTE(review): relies on `transcript`, `PID_coarse`, and
# `transcripts_clusters` defined earlier in the document.
example_tile <- read.csv("./data/output/annotated_transcripts_tile_10_0.csv") |>
  filter(transcript_id == transcript)
example_annotation <- read.csv("./data/processed/dense_subset_processed.csv") |>
  filter(PID == PID_coarse)
# Mark where this participant's annotations fall relative to the tiled windows.
example_tile <- detect_window_annotations(example_tile, example_annotation)
# Map this participant's topic labels to their cluster labels.
example_cluster <- transcripts_clusters |>
  filter(PID == PID_coarse) |>
  select(cluster_label, "annotated_label" = new_topic) |>
  distinct()
# Left join keeps every tile row; unmatched rows get NA cluster_label.
example_tile <- merge(example_tile, example_cluster, by = 'annotated_label', all.x = TRUE)